* (bug 2271) Use faster text-only link replacement in image alt text
instead of rerunning expensive link lookup and HTML generation.
* Only build the HTML attribute whitelist tree once.
+* Replace wfMungeToUtf8 and do_html_entity_decode with a single function
+ that does both numeric and named chars: Sanitizer::decodeCharReferences
+* Removed some obsolete UTF-8 converter functions
=== Caveats ===
* @access private
*/
function sectionAnchor( $text ) {
- global $wgInputEncoding;
- $headline = do_html_entity_decode( $text, ENT_COMPAT, $wgInputEncoding );
+ $headline = Sanitizer::decodeCharReferences( $text );
# strip out HTML
$headline = preg_replace( '/<.*?' . '>/', '', $headline );
$headline = trim( $headline );
}
}
-/**
- * html_entity_decode exists in PHP 4.3.0+ but is FATALLY BROKEN even then,
- * with no UTF-8 support.
- *
- * @param string $string String having html entities
- * @param $quote_style the quote style to pass as the second argument to
- * get_html_translation_table()
- * @param string $charset Encoding set to use (default 'UTF-8')
- */
-function do_html_entity_decode( $string, $quote_style=ENT_COMPAT, $charset='UTF-8' ) {
- $fname = 'do_html_entity_decode';
- wfProfileIn( $fname );
-
- static $trans;
- static $savedCharset;
- static $regexp;
- if( !isset( $trans ) || $savedCharset != $charset ) {
- $trans = array_flip( get_html_translation_table( HTML_ENTITIES, $quote_style ) );
- $savedCharset = $charset;
-
- # Note - mixing latin1 named entities and unicode numbered
- # ones will result in a bad link.
- if( strcasecmp( 'utf-8', $charset ) == 0 ) {
- $trans = array_map( 'utf8_encode', $trans );
- }
-
- /**
- * Most links will _not_ contain these fun guys,
- * and on long pages with many links we can get
- * called a lot.
- *
- * A regular expression search is faster than
- * a strtr or str_replace with a hundred-ish
- * entries, though it may be slower to actually
- * replace things.
- *
- * They all look like '&xxxx;'...
- */
- foreach( $trans as $key => $val ) {
- $snip[] = substr( $key, 1, -1 );
- }
- $regexp = '/(&(?:' . implode( '|', $snip ) . ');)/e';
- }
-
- $out = preg_replace( $regexp, '$trans["$1"]', $string );
- wfProfileOut( $fname );
- return $out;
-}
-
-
/**
* Where as we got a random seed
* @var bool $wgTotalViews
return $s;
}
-/**
- * Return the UTF-8 sequence for a given Unicode code point.
- * Doesn't work for values outside the Basic Multilingual Plane.
- *
- * @param string $codepoint UTF-8 code point.
- * @return string An UTF-8 character if the codepoint is in the BMP and
- * &#$codepoint if it isn't;
- */
-function wfUtf8Sequence( $codepoint ) {
- if($codepoint < 0x80)
- return chr($codepoint);
- if($codepoint < 0x800)
- return chr($codepoint >> 6 & 0x3f | 0xc0) . chr($codepoint & 0x3f | 0x80);
- if($codepoint < 0x10000)
- return chr($codepoint >> 12 & 0x0f | 0xe0) .
- chr($codepoint >> 6 & 0x3f | 0x80) .
- chr($codepoint & 0x3f | 0x80);
- if($codepoint < 0x110000)
- return chr($codepoint >> 18 & 0x07 | 0xf0) .
- chr($codepoint >> 12 & 0x3f | 0x80) .
- chr($codepoint >> 6 & 0x3f | 0x80) .
- chr($codepoint & 0x3f | 0x80);
- # There should be no assigned code points outside this range, but...
- return "&#$codepoint;";
-}
-
-/**
- * Converts numeric character entities to UTF-8
- *
- * @todo Do named entities
- *
- * @param string $string String to convert.
- * @return string Converted string.
- */
-function wfMungeToUtf8( $string ) {
- global $wgInputEncoding; # This is debatable
- #$string = iconv($wgInputEncoding, "UTF-8", $string);
- $string = preg_replace ( '/&#0*([0-9]+);/e', 'wfUtf8Sequence($1)', $string );
- $string = preg_replace ( '/&#x([0-9a-f]+);/ie', 'wfUtf8Sequence(0x$1)', $string );
- return $string;
-}
-
-/**
- * Converts a single UTF-8 character into the corresponding HTML character
- * entity (for use with preg_replace_callback)
- *
- * @param array $matches
- *
- */
-function wfUtf8Entity( $matches ) {
- $codepoint = utf8ToCodepoint( $matches[0] );
- return "&#$codepoint;";
-}
-
-/**
- * Converts all multi-byte characters in a UTF-8 string into the appropriate
- * character entity
- */
-function wfUtf8ToHTML($string) {
- return preg_replace_callback( '/[\\xc0-\\xfd][\\x80-\\xbf]*/', 'wfUtf8Entity', $string );
-}
-
/**
* Sends a line to the debug log if enabled or, optionally, to a comment in output.
* In normal operation this is a NOP.
* Pass a title object, not a title string
*/
function makeLinkObj( &$nt, $text= '', $query = '', $trail = '', $prefix = '' ) {
- global $wgOut, $wgUser, $wgInputEncoding;
+ global $wgOut, $wgUser;
$fname = 'Linker::makeLinkObj';
wfProfileIn( $fname );
$parts = explode( '#', $u, 2 );
if ( count( $parts ) == 2 ) {
- $anchor = urlencode( do_html_entity_decode( str_replace(' ', '_', $parts[1] ),
- ENT_COMPAT,
- $wgInputEncoding ) );
+ $anchor = urlencode( Sanitizer::decodeCharReferences( str_replace(' ', '_', $parts[1] ) ) );
$replacearray = array(
'%3A' => ':',
'%' => '.'
* Pass a title object, not a title string
*/
function makeKnownLinkObj( $nt, $text = '', $query = '', $trail = '', $prefix = '' , $aprops = '' ) {
- global $wgOut, $wgTitle, $wgInputEncoding;
+ global $wgOut, $wgTitle;
$fname = 'Linker::makeKnownLinkObj';
wfProfileIn( $fname );
$text = htmlspecialchars( $nt->getFragment() );
}
}
- $anchor = urlencode( do_html_entity_decode( str_replace(' ', '_', $nt->getFragment()), ENT_COMPAT, $wgInputEncoding ) );
+ $anchor = urlencode( Sanitizer::decodeCharReferences( str_replace( ' ', '_', $nt->getFragment() ) ) );
$replacearray = array(
'%3A' => ':',
'%' => '.'
* @access private
*/
function formatHeadings( $text, $isMain=true ) {
- global $wgInputEncoding, $wgMaxTocLevel, $wgContLang, $wgLinkHolders, $wgInterwikiLinkHolders;
+ global $wgMaxTocLevel, $wgContLang, $wgLinkHolders, $wgInterwikiLinkHolders;
$doNumberHeadings = $this->mOptions->getNumberHeadings();
$doShowToc = true;
# strip out HTML
$canonized_headline = preg_replace( '/<.*?' . '>/','',$canonized_headline );
$tocline = trim( $canonized_headline );
- $canonized_headline = urlencode( do_html_entity_decode( str_replace(' ', '_', $tocline), ENT_COMPAT, $wgInputEncoding ) );
+ $canonized_headline = urlencode( Sanitizer::decodeCharReferences( str_replace(' ', '_', $tocline) ) );
$replacearray = array(
'%3A' => ':',
'%' => '.'
* @subpackage Parser
*/
+/**
+ * Regular expression to match various types of character references in
+ * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
+ *
+ * Capture groups, in order:
+ *   1. named entity            &name;
+ *   2. decimal reference       &#123;
+ *   3. hexadecimal reference   &#x1f; (lower-case x)
+ *   4. hexadecimal reference   &#X1F; (upper-case X)
+ *   5. a lone ampersand that is not part of any of the above
+ *
+ * The hexadecimal groups accept hex digits only. hexdec() silently skips
+ * characters that are not hex digits, so a looser class would let
+ * something like &#xG41; be read as U+0041.
+ *
+ * The pattern uses the /x modifier, so the line breaks and indentation
+ * inside it are ignored and the literal # has to be escaped.
+ */
+define( 'MW_CHAR_REFS_REGEX',
+	'/&([A-Za-z0-9]+);
+	 |&\#([0-9]+);
+	 |&\#x([0-9A-Fa-f]+);
+	 |&\#X([0-9A-Fa-f]+);
+	 |(&)/x' );
+
+/**
+ * Map of all named character entities defined in HTML 4.01 to the
+ * decimal Unicode code point of the character each one stands for.
+ * http://www.w3.org/TR/html4/sgml/entities.html
+ * @access private
+ */
+global $wgHtmlEntities;
+$wgHtmlEntities = array(
+ 'Aacute' => 193,
+ 'aacute' => 225,
+ 'Acirc' => 194,
+ 'acirc' => 226,
+ 'acute' => 180,
+ 'AElig' => 198,
+ 'aelig' => 230,
+ 'Agrave' => 192,
+ 'agrave' => 224,
+ 'alefsym' => 8501,
+ 'Alpha' => 913,
+ 'alpha' => 945,
+ 'amp' => 38,
+ 'and' => 8743,
+ 'ang' => 8736,
+ 'Aring' => 197,
+ 'aring' => 229,
+ 'asymp' => 8776,
+ 'Atilde' => 195,
+ 'atilde' => 227,
+ 'Auml' => 196,
+ 'auml' => 228,
+ 'bdquo' => 8222,
+ 'Beta' => 914,
+ 'beta' => 946,
+ 'brvbar' => 166,
+ 'bull' => 8226,
+ 'cap' => 8745,
+ 'Ccedil' => 199,
+ 'ccedil' => 231,
+ 'cedil' => 184,
+ 'cent' => 162,
+ 'Chi' => 935,
+ 'chi' => 967,
+ 'circ' => 710,
+ 'clubs' => 9827,
+ 'cong' => 8773,
+ 'copy' => 169,
+ 'crarr' => 8629,
+ 'cup' => 8746,
+ 'curren' => 164,
+ 'dagger' => 8224,
+ 'Dagger' => 8225,
+ 'darr' => 8595,
+ 'dArr' => 8659,
+ 'deg' => 176,
+ 'Delta' => 916,
+ 'delta' => 948,
+ 'diams' => 9830,
+ 'divide' => 247,
+ 'Eacute' => 201,
+ 'eacute' => 233,
+ 'Ecirc' => 202,
+ 'ecirc' => 234,
+ 'Egrave' => 200,
+ 'egrave' => 232,
+ 'empty' => 8709,
+ 'emsp' => 8195,
+ 'ensp' => 8194,
+ 'Epsilon' => 917,
+ 'epsilon' => 949,
+ 'equiv' => 8801,
+ 'Eta' => 919,
+ 'eta' => 951,
+ 'ETH' => 208,
+ 'eth' => 240,
+ 'Euml' => 203,
+ 'euml' => 235,
+ 'euro' => 8364,
+ 'exist' => 8707,
+ 'fnof' => 402,
+ 'forall' => 8704,
+ 'frac12' => 189,
+ 'frac14' => 188,
+ 'frac34' => 190,
+ 'frasl' => 8260,
+ 'Gamma' => 915,
+ 'gamma' => 947,
+ 'ge' => 8805,
+ 'gt' => 62,
+ 'harr' => 8596,
+ 'hArr' => 8660,
+ 'hearts' => 9829,
+ 'hellip' => 8230,
+ 'Iacute' => 205,
+ 'iacute' => 237,
+ 'Icirc' => 206,
+ 'icirc' => 238,
+ 'iexcl' => 161,
+ 'Igrave' => 204,
+ 'igrave' => 236,
+ 'image' => 8465,
+ 'infin' => 8734,
+ 'int' => 8747,
+ 'Iota' => 921,
+ 'iota' => 953,
+ 'iquest' => 191,
+ 'isin' => 8712,
+ 'Iuml' => 207,
+ 'iuml' => 239,
+ 'Kappa' => 922,
+ 'kappa' => 954,
+ 'Lambda' => 923,
+ 'lambda' => 955,
+ 'lang' => 9001,
+ 'laquo' => 171,
+ 'larr' => 8592,
+ 'lArr' => 8656,
+ 'lceil' => 8968,
+ 'ldquo' => 8220,
+ 'le' => 8804,
+ 'lfloor' => 8970,
+ 'lowast' => 8727,
+ 'loz' => 9674,
+ 'lrm' => 8206,
+ 'lsaquo' => 8249,
+ 'lsquo' => 8216,
+ 'lt' => 60,
+ 'macr' => 175,
+ 'mdash' => 8212,
+ 'micro' => 181,
+ 'middot' => 183,
+ 'minus' => 8722,
+ 'Mu' => 924,
+ 'mu' => 956,
+ 'nabla' => 8711,
+ 'nbsp' => 160,
+ 'ndash' => 8211,
+ 'ne' => 8800,
+ 'ni' => 8715,
+ 'not' => 172,
+ 'notin' => 8713,
+ 'nsub' => 8836,
+ 'Ntilde' => 209,
+ 'ntilde' => 241,
+ 'Nu' => 925,
+ 'nu' => 957,
+ 'Oacute' => 211,
+ 'oacute' => 243,
+ 'Ocirc' => 212,
+ 'ocirc' => 244,
+ 'OElig' => 338,
+ 'oelig' => 339,
+ 'Ograve' => 210,
+ 'ograve' => 242,
+ 'oline' => 8254,
+ 'Omega' => 937,
+ 'omega' => 969,
+ 'Omicron' => 927,
+ 'omicron' => 959,
+ 'oplus' => 8853,
+ 'or' => 8744,
+ 'ordf' => 170,
+ 'ordm' => 186,
+ 'Oslash' => 216,
+ 'oslash' => 248,
+ 'Otilde' => 213,
+ 'otilde' => 245,
+ 'otimes' => 8855,
+ 'Ouml' => 214,
+ 'ouml' => 246,
+ 'para' => 182,
+ 'part' => 8706,
+ 'permil' => 8240,
+ 'perp' => 8869,
+ 'Phi' => 934,
+ 'phi' => 966,
+ 'Pi' => 928,
+ 'pi' => 960,
+ 'piv' => 982,
+ 'plusmn' => 177,
+ 'pound' => 163,
+ 'prime' => 8242,
+ 'Prime' => 8243,
+ 'prod' => 8719,
+ 'prop' => 8733,
+ 'Psi' => 936,
+ 'psi' => 968,
+ 'quot' => 34,
+ 'radic' => 8730,
+ 'rang' => 9002,
+ 'raquo' => 187,
+ 'rarr' => 8594,
+ 'rArr' => 8658,
+ 'rceil' => 8969,
+ 'rdquo' => 8221,
+ 'real' => 8476,
+ 'reg' => 174,
+ 'rfloor' => 8971,
+ 'Rho' => 929,
+ 'rho' => 961,
+ 'rlm' => 8207,
+ 'rsaquo' => 8250,
+ 'rsquo' => 8217,
+ 'sbquo' => 8218,
+ 'Scaron' => 352,
+ 'scaron' => 353,
+ 'sdot' => 8901,
+ 'sect' => 167,
+ 'shy' => 173,
+ 'Sigma' => 931,
+ 'sigma' => 963,
+ 'sigmaf' => 962,
+ 'sim' => 8764,
+ 'spades' => 9824,
+ 'sub' => 8834,
+ 'sube' => 8838,
+ 'sum' => 8721,
+ 'sup' => 8835,
+ 'sup1' => 185,
+ 'sup2' => 178,
+ 'sup3' => 179,
+ 'supe' => 8839,
+ 'szlig' => 223,
+ 'Tau' => 932,
+ 'tau' => 964,
+ 'there4' => 8756,
+ 'Theta' => 920,
+ 'theta' => 952,
+ 'thetasym' => 977,
+ 'thinsp' => 8201,
+ 'THORN' => 222,
+ 'thorn' => 254,
+ 'tilde' => 732,
+ 'times' => 215,
+ 'trade' => 8482,
+ 'Uacute' => 218,
+ 'uacute' => 250,
+ 'uarr' => 8593,
+ 'uArr' => 8657,
+ 'Ucirc' => 219,
+ 'ucirc' => 251,
+ 'Ugrave' => 217,
+ 'ugrave' => 249,
+ 'uml' => 168,
+ 'upsih' => 978,
+ 'Upsilon' => 933,
+ 'upsilon' => 965,
+ 'Uuml' => 220,
+ 'uuml' => 252,
+ 'weierp' => 8472,
+ 'Xi' => 926,
+ 'xi' => 958,
+ 'Yacute' => 221,
+ 'yacute' => 253,
+ 'yen' => 165,
+ 'Yuml' => 376,
+ 'yuml' => 255,
+ 'Zeta' => 918,
+ 'zeta' => 950,
+ 'zwj' => 8205,
+ 'zwnj' => 8204 );
+
class Sanitizer {
/**
* Cleans up HTML, removes dangerous tags and attributes, and
# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
if( $attribute == 'style' && preg_match(
'/(expression|tps*:\/\/|url\\s*\().*/is',
- wfMungeToUtf8( $value ) ) ) {
+ Sanitizer::decodeCharReferences( $value ) ) ) {
# haxx0r
continue;
}
*/
function normalizeCharReferences( $text ) {
return preg_replace_callback(
- '/&([A-Za-z0-9]+);
- |&\#([0-9]+);
- |&\#x([0-9A-Za-z]+);
- |&\#X([0-9A-Za-z]+);
- |(&)/x',
+ MW_CHAR_REFS_REGEX,
array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
$text );
}
* @return string
*/
function normalizeEntity( $name ) {
- # List of all named character entities defined in HTML 4.01
- # http://www.w3.org/TR/html4/sgml/entities.html
- static $htmlEntities = array(
- 'aacute' => true,
- 'Aacute' => true,
- 'acirc' => true,
- 'Acirc' => true,
- 'acute' => true,
- 'aelig' => true,
- 'AElig' => true,
- 'agrave' => true,
- 'Agrave' => true,
- 'alefsym' => true,
- 'alpha' => true,
- 'Alpha' => true,
- 'amp' => true,
- 'and' => true,
- 'ang' => true,
- 'apos' => true,
- 'aring' => true,
- 'Aring' => true,
- 'asymp' => true,
- 'atilde' => true,
- 'Atilde' => true,
- 'auml' => true,
- 'Auml' => true,
- 'bdquo' => true,
- 'beta' => true,
- 'Beta' => true,
- 'brvbar' => true,
- 'bull' => true,
- 'cap' => true,
- 'ccedil' => true,
- 'Ccedil' => true,
- 'cedil' => true,
- 'cent' => true,
- 'chi' => true,
- 'Chi' => true,
- 'circ' => true,
- 'clubs' => true,
- 'cong' => true,
- 'copy' => true,
- 'crarr' => true,
- 'cup' => true,
- 'curren' => true,
- 'dagger' => true,
- 'Dagger' => true,
- 'darr' => true,
- 'dArr' => true,
- 'deg' => true,
- 'delta' => true,
- 'Delta' => true,
- 'diams' => true,
- 'divide' => true,
- 'eacute' => true,
- 'Eacute' => true,
- 'ecirc' => true,
- 'Ecirc' => true,
- 'egrave' => true,
- 'Egrave' => true,
- 'empty' => true,
- 'emsp' => true,
- 'ensp' => true,
- 'epsilon' => true,
- 'Epsilon' => true,
- 'equiv' => true,
- 'eta' => true,
- 'Eta' => true,
- 'eth' => true,
- 'ETH' => true,
- 'euml' => true,
- 'Euml' => true,
- 'euro' => true,
- 'exist' => true,
- 'fnof' => true,
- 'forall' => true,
- 'frac12' => true,
- 'frac14' => true,
- 'frac34' => true,
- 'frasl' => true,
- 'gamma' => true,
- 'Gamma' => true,
- 'ge' => true,
- 'gt' => true,
- 'harr' => true,
- 'hArr' => true,
- 'hearts' => true,
- 'hellip' => true,
- 'iacute' => true,
- 'Iacute' => true,
- 'icirc' => true,
- 'Icirc' => true,
- 'iexcl' => true,
- 'igrave' => true,
- 'Igrave' => true,
- 'image' => true,
- 'infin' => true,
- 'int' => true,
- 'iota' => true,
- 'Iota' => true,
- 'iquest' => true,
- 'isin' => true,
- 'iuml' => true,
- 'Iuml' => true,
- 'kappa' => true,
- 'Kappa' => true,
- 'lambda' => true,
- 'Lambda' => true,
- 'lang' => true,
- 'laquo' => true,
- 'larr' => true,
- 'lArr' => true,
- 'lceil' => true,
- 'ldquo' => true,
- 'le' => true,
- 'lfloor' => true,
- 'lowast' => true,
- 'loz' => true,
- 'lrm' => true,
- 'lsaquo' => true,
- 'lsquo' => true,
- 'lt' => true,
- 'macr' => true,
- 'mdash' => true,
- 'micro' => true,
- 'middot' => true,
- 'minus' => true,
- 'mu' => true,
- 'Mu' => true,
- 'nabla' => true,
- 'nbsp' => true,
- 'ndash' => true,
- 'ne' => true,
- 'ni' => true,
- 'not' => true,
- 'notin' => true,
- 'nsub' => true,
- 'ntilde' => true,
- 'Ntilde' => true,
- 'nu' => true,
- 'Nu' => true,
- 'oacute' => true,
- 'Oacute' => true,
- 'ocirc' => true,
- 'Ocirc' => true,
- 'oelig' => true,
- 'OElig' => true,
- 'ograve' => true,
- 'Ograve' => true,
- 'oline' => true,
- 'omega' => true,
- 'Omega' => true,
- 'omicron' => true,
- 'Omicron' => true,
- 'oplus' => true,
- 'or' => true,
- 'ordf' => true,
- 'ordm' => true,
- 'oslash' => true,
- 'Oslash' => true,
- 'otilde' => true,
- 'Otilde' => true,
- 'otimes' => true,
- 'ouml' => true,
- 'Ouml' => true,
- 'para' => true,
- 'part' => true,
- 'permil' => true,
- 'perp' => true,
- 'phi' => true,
- 'Phi' => true,
- 'pi' => true,
- 'Pi' => true,
- 'piv' => true,
- 'plusmn' => true,
- 'pound' => true,
- 'prime' => true,
- 'Prime' => true,
- 'prod' => true,
- 'prop' => true,
- 'psi' => true,
- 'Psi' => true,
- 'quot' => true,
- 'radic' => true,
- 'rang' => true,
- 'raquo' => true,
- 'rarr' => true,
- 'rArr' => true,
- 'rceil' => true,
- 'rdquo' => true,
- 'real' => true,
- 'reg' => true,
- 'rfloor' => true,
- 'rho' => true,
- 'Rho' => true,
- 'rlm' => true,
- 'rsaquo' => true,
- 'rsquo' => true,
- 'sbquo' => true,
- 'scaron' => true,
- 'Scaron' => true,
- 'sdot' => true,
- 'sect' => true,
- 'shy' => true,
- 'sigma' => true,
- 'Sigma' => true,
- 'sigmaf' => true,
- 'sim' => true,
- 'spades' => true,
- 'sub' => true,
- 'sube' => true,
- 'sum' => true,
- 'sup' => true,
- 'sup1' => true,
- 'sup2' => true,
- 'sup3' => true,
- 'supe' => true,
- 'szlig' => true,
- 'tau' => true,
- 'Tau' => true,
- 'there4' => true,
- 'theta' => true,
- 'Theta' => true,
- 'thetasym' => true,
- 'thinsp' => true,
- 'thorn' => true,
- 'THORN' => true,
- 'tilde' => true,
- 'times' => true,
- 'trade' => true,
- 'uacute' => true,
- 'Uacute' => true,
- 'uarr' => true,
- 'uArr' => true,
- 'ucirc' => true,
- 'Ucirc' => true,
- 'ugrave' => true,
- 'Ugrave' => true,
- 'uml' => true,
- 'upsih' => true,
- 'upsilon' => true,
- 'Upsilon' => true,
- 'uuml' => true,
- 'Uuml' => true,
- 'weierp' => true,
- 'xi' => true,
- 'Xi' => true,
- 'yacute' => true,
- 'Yacute' => true,
- 'yen' => true,
- 'yuml' => true,
- 'Yuml' => true,
- 'zeta' => true,
- 'Zeta' => true,
- 'zwj' => true,
- 'zwnj' => true );
- if( isset( $htmlEntities[$name] ) ) {
+ global $wgHtmlEntities;
+ if( isset( $wgHtmlEntities[$name] ) ) {
return "&$name;";
} else {
return "&$name;";
|| ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
}
+	/**
+	 * Replace every character reference found in the text -- named
+	 * entity or numeric, decimal or hexadecimal -- and hand back the
+	 * result as a UTF-8 string.
+	 *
+	 * @param string $text
+	 * @return string
+	 * @access public
+	 */
+	function decodeCharReferences( $text ) {
+		$callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
+		return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
+	}
+
+	/**
+	 * preg_replace_callback() handler for decodeCharReferences().
+	 *
+	 * @param array $matches Capture groups of MW_CHAR_REFS_REGEX:
+	 *   1 = entity name, 2 = decimal digits, 3 and 4 = hex digits,
+	 *   0 = the whole match
+	 * @return string
+	 */
+	function decodeCharReferencesCallback( $matches ) {
+		# Named entity
+		if( $matches[1] != '' ) {
+			return Sanitizer::decodeEntity( $matches[1] );
+		}
+		# Decimal reference
+		if( $matches[2] != '' ) {
+			return Sanitizer::decodeChar( intval( $matches[2] ) );
+		}
+		# Hexadecimal reference; groups 3 and 4 are handled the same way
+		$hexDigits = ( $matches[3] != '' ) ? $matches[3] : $matches[4];
+		if( $hexDigits != '' ) {
+			return Sanitizer::decodeChar( hexdec( $hexDigits ) );
+		}
+		# Nothing captured above: should be an ampersand by itself
+		return $matches[0];
+	}
+
+	/**
+	 * Turn a code point into its UTF-8 string when
+	 * Sanitizer::validateCodepoint() accepts it; any other value
+	 * yields U+FFFD REPLACEMENT CHARACTER.
+	 * @param int $codepoint
+	 * @return string
+	 * @access private
+	 */
+	function decodeChar( $codepoint ) {
+		return Sanitizer::validateCodepoint( $codepoint )
+			? codepointToUtf8( $codepoint )
+			: UTF8_REPLACEMENT;
+	}
+
+	/**
+	 * Look the entity name up in $wgHtmlEntities. A known name gives
+	 * the UTF-8 encoding of its character; an unknown one is handed
+	 * back as pseudo-entity source (eg &foo;).
+	 *
+	 * @param string $name
+	 * @return string
+	 */
+	function decodeEntity( $name ) {
+		global $wgHtmlEntities;
+		if( !isset( $wgHtmlEntities[$name] ) ) {
+			# Not in the table: leave the text as it was written
+			return "&$name;";
+		}
+		return codepointToUtf8( $wgHtmlEntities[$name] );
+	}
+
/**
* Fetch the whitelist of acceptable attributes for a given
* element name.
*/
#resolve entity-refs to look at attributes. may be harsh on big files... cache result?
- $chunk= wfMungeToUtf8($chunk); #this should actually use do_html_decode_entites, once this also deals with numeric entities.
+ $chunk = Sanitizer::decodeCharReferences( $chunk );
#look for script-types
if (preg_match("!type\s*=\s*['\"]?\s*(\w*/)?(ecma|java)!sim",$chunk)) return true;
}
/**
- * Convert things like &eacute; into real text...
+ * Convert things like &eacute; &#257; or &#x3017; into real text...
*/
- global $wgInputEncoding;
- $filteredText = do_html_entity_decode( $text, ENT_COMPAT, $wgInputEncoding );
-
- /**
- * Convert things like &#257; or &#x3017; into real text...
- * WARNING: Not friendly to internal links on a latin-1 wiki.
- */
- $filteredText = wfMungeToUtf8( $filteredText );
-
- # What was this for? TS 2004-03-03
- # $text = urldecode( $text );
+ $filteredText = Sanitizer::decodeCharReferences( $text );
$t =& new Title();
$t->mDbkeyform = str_replace( ' ', '_', $filteredText );
# Safari sends filenames in HTML-encoded Unicode form D...
# Horrid and evil! Let's try to make some kind of sense of it.
- $name = wfMungeToUtf8( $name );
+ $name = Sanitizer::decodeCharReferences( $name );
$name = UtfNormal::cleanUp( $name );
wfDebug( "WebRequest::getFileName() '" . $_FILES[$key]['name'] . "' normalized to '$name'\n" );
return $name;
}
}
- function testDecodeLatin() {
- $this->assertEquals(
- "\xe9cole",
- do_html_entity_decode( 'école', ENT_COMPAT, 'iso-8859-1' ) );
- }
-
- function testDecodeUnicode() {
- $this->assertEquals(
- "\xc3\xa9cole",
- do_html_entity_decode( 'école', ENT_COMPAT, 'utf-8' ) );
- }
-
function testRandom() {
# This could hypothetically fail, but it shouldn't ;)
$this->assertFalse(
wfUrlencode( "\xE7\x89\xB9\xE5\x88\xA5:Contributions/Foobar" ) );
}
- function testUtf8Sequence1() {
- $this->assertEquals(
- 'A',
- wfUtf8Sequence( 65 ) );
- }
-
- function testUtf8Sequence2() {
- $this->assertEquals(
- "\xc4\x88",
- wfUtf8Sequence( 0x108 ) );
- }
-
- function testUtf8Sequence3() {
- $this->assertEquals(
- "\xe3\x81\x8b",
- wfUtf8Sequence( 0x304b ) );
- }
-
- function testUtf8Sequence4() {
- $this->assertEquals(
- "\xf0\x90\x91\x90",
- wfUtf8Sequence( 0x10450 ) );
- }
-
- function testMungeToUtf8() {
- $this->assertEquals(
- "\xc4\x88io bonas dans l'\xc3\xa9cole!",
- wfMungeToUtf8( "Ĉio bonas dans l'école!" ) );
- }
-
- function testUtf8ToHTML() {
- $this->assertEquals(
- "Ĉio bonas dans l'école!",
- wfUtf8ToHTML( "\xc4\x88io bonas dans l'\xc3\xa9cole!" ) );
- }
-
function testReadOnlyEmpty() {
$this->assertFalse( wfReadOnly() );
}
'SearchMySQL3Test',
'SearchMySQL4Test',
'ArticleTest',
+ 'SanitizerTest',
);
foreach( $tests as $test ) {
require_once( $test . '.php' );
--- /dev/null
+<?php
+
+require_once( 'PHPUnit.php' );
+require_once( '../includes/Defines.php' );
+require_once( '../includes/Profiling.php' );
+require_once( '../includes/GlobalFunctions.php' );
+require_once( '../includes/Sanitizer.php' );
+
+/**
+ * Tests for Sanitizer::decodeCharReferences().
+ *
+ * Every input is written with character references spelled out in
+ * ASCII, so the assertions only hold if the decoder really converts
+ * them; expected values are given as explicit UTF-8 byte sequences.
+ */
+class SanitizerTest extends PHPUnit_TestCase {
+	function SanitizerTest( $name ) {
+		$this->PHPUnit_TestCase( $name );
+	}
+
+	function setUp() {
+	}
+
+	function tearDown() {
+	}
+
+	# A named entity becomes the UTF-8 bytes of its character
+	function testDecodeNamed() {
+		$this->assertEquals(
+			"\xc3\xa9cole",
+			Sanitizer::decodeCharReferences( '&eacute;cole' ) );
+	}
+
+	# Hexadecimal and decimal numeric references
+	function testDecodeNumbered() {
+		$this->assertEquals(
+			"\xc4\x88io bonas dans l'\xc3\xa9cole!",
+			Sanitizer::decodeCharReferences( "&#x108;io bonas dans l'&#233;cole!" ) );
+	}
+
+	# Numeric and named references in the same string
+	function testDecodeMixed() {
+		$this->assertEquals(
+			"\xc4\x88io bonas dans l'\xc3\xa9cole!",
+			Sanitizer::decodeCharReferences( "&#x108;io bonas dans l'&eacute;cole!" ) );
+	}
+
+	# Decoding is a single pass: "&amp;" turns into "&", and the
+	# reference text that now follows it must not be decoded again
+	function testDecodeMixedComplex() {
+		$this->assertEquals(
+			"\xc4\x88io bonas dans l'\xc3\xa9cole! (mais pas &#x108;io dans l'&eacute;cole)",
+			Sanitizer::decodeCharReferences( "&#x108;io bonas dans l'&eacute;cole! (mais pas &amp;#x108;io dans l'&amp;eacute;cole)" ) );
+	}
+
+	# A lone ampersand is left as it is
+	function testDecodeInvalidAmp() {
+		$this->assertEquals(
+			"a & b",
+			Sanitizer::decodeCharReferences( "a & b" ) );
+	}
+
+	# An unknown entity name is left as it is
+	function testDecodeInvalidNamed() {
+		$this->assertEquals(
+			"&foo;",
+			Sanitizer::decodeCharReferences( "&foo;" ) );
+	}
+
+	# A numeric reference outside the valid range gives U+FFFD
+	function testDecodeInvalidNumbered() {
+		$this->assertEquals(
+			UTF8_REPLACEMENT,
+			Sanitizer::decodeCharReferences( "&#88888888888888;" ) );
+	}
+
+	/* TODO: many more! */
+}
+
+?>
\ No newline at end of file